*******************************************************
* This file uses IDAPALL raw data to define workers'
* occupational layer
*******************************************************

* Start from an empty session and switch off output paging.
clear all
set more off

* All file paths below are relative to the project data directory, which the
* caller supplies through the global macro data_dir.
* (earlier hard-coded location, kept for reference)
*cd "Y:\data\workdata\702069\FAPP2069\"
* Stop right away when data_dir is empty: an empty path is not the data
* folder, so every relative file name used later would resolve elsewhere.
if `"${data_dir}"' == "" {
    display as error "global data_dir is not set; define it before running this do-file"
    exit 198
}
cd "${data_dir}"

* Load the raw IDA records, reading only the variables used in this file
*************************************************
use pnr aar lbnr cvrnr joblon timelon sektor DISCOALLE_INDK DISCO08_ALLE_INDK ///
    type hfaudd koen alder using "idapall.dta", clear

* Restrict to the years 1999-2008 (aar is compared as a string).
* The list is split over two inlist() calls to stay within the
* argument limit that inlist() has for strings.
keep if inlist(aar, "1999", "2000", "2001", "2002", "2003") | ///
        inlist(aar, "2004", "2005", "2006", "2007", "2008")

* Attach firm identifiers from fida.dta; only records present in both
* files (same aar, pnr and lbnr) are retained
merge 1:1 aar pnr lbnr using fida.dta, keep(match) nogenerate

* Records per year, followed by the number of distinct cvrnr per year
* (eval marks the first record of every aar-cvrnr group)
tabulate aar
bysort aar cvrnr: generate eval = (_n == 1)
tabulate aar if eval == 1

* Add other worker characteristics

* Education: look up the degree code (hfaudd, renamed to audd) in
* audd_educ.dta. Every worker record is kept whether or not its code is
* found; rows that exist only in audd_educ.dta are discarded.
rename hfaudd audd
merge n:1 audd using "audd_educ.dta", keep(master match) nogenerate

* Collapse the h1 codes into five education groups; any other h1 value
* leaves educ missing
generate educ = .
replace educ = 1 if inlist(h1, "10", "15")
replace educ = 2 if inlist(h1, "20", "25")
replace educ = 3 if inlist(h1, "30", "35", "39")
replace educ = 4 if inlist(h1, "40", "50", "60")
replace educ = 5 if inlist(h1, "65", "70")

* Value labels for the five education groups
label define educlabel              ///
    1 "Primary Education"           ///
    2 "High School"                 ///
    3 "Vocational Training"         ///
    4 "Bachelor Degree"             ///
    5 "Master Degree or PhD"
label values educ educlabel

* Labor market history (tenure and experience by industry): matched on
* aar, pnr and cvrnr. All current records are kept; rows found only in
* experience_tenure.dta are discarded.
merge n:1 aar pnr cvrnr using "experience_tenure.dta", keep(master match) nogenerate

* Apply the 1999-2008 year window again
keep if inlist(aar, "1999", "2000", "2001", "2002", "2003") | ///
        inlist(aar, "2004", "2005", "2006", "2007", "2008")

* Numeric copies of alder (-> age) and koen (-> gender)
destring alder, generate(age)
destring koen, generate(gender)

* Write the broader worker file, then carry on with the complete data:
* preserve/restore brings back the variables removed by keep
preserve
keep pnr aar lbnr cvrnr type educ experience tenure age gender
save workerdata_full.dta, replace
restore

* only keep active workers (in November)
*****************************************
* A record is removed when timelon or joblon is missing or zero.
* missing() is used rather than "== ." so that extended missing values
* (.a-.z) are removed too; "== ." matches only the system missing value,
* and a surviving extended missing would make joblon / timelon (computed
* further down as hours) missing.
drop if missing(timelon) | timelon == 0
drop if missing(joblon) | joblon == 0

* Construct Layers
************************
* Numeric copy of the occupation code; with the force option, codes that
* contain non-numeric characters become missing
rename DISCOALLE_INDK funk
destring funk, generate(funkn) force
tabstat funkn, statistics(min max)

* Military occupations (string codes "0110" and "011000") are pooled
* under the numeric code 900
replace funkn = 900 if inlist(funk, "0110", "011000")
tabulate aar if inlist(funk, "0110", "011000")

* order2 = number of digits of funkn: 0-9 -> 1, 10-99 -> 2, ...,
* 100000-999999 -> 6. It stays missing when funkn is missing or falls
* outside these ranges.
generate order2 = .
local lo 0
local hi 9
forvalues digits = 1/6 {
    replace order2 = `digits' if inrange(funkn, `lo', `hi')
    local lo = `hi' + 1
    local hi = 10 * `lo' - 1
}

* Original author's note: all workers with missing occupation have
* order2 == 1 (an observation about the data, checked with this table)
tabulate aar order2

* Bring every code to 3 digits for the match with the layer
* correspondence: shorter codes are padded with zeros on the right,
* longer codes are truncated to their first three digits
capture drop funk3n
generate funk3n = cond(order2 == 1, funkn * 100,        ///
                  cond(order2 == 2, funkn * 10,         ///
                  cond(order2 == 3, funkn,              ///
                  cond(order2 == 4, int(funkn / 10),    ///
                  cond(order2 == 5, int(funkn / 100),   ///
                  cond(order2 == 6, int(funkn / 1000), .))))))

* Layer mapping following Caliendo et al:
*******************************************

* Look up the PCS category for each 3-digit code. Rows that exist only in
* the correspondence file are discarded; the merge result is kept in
* pcs_merged.
sort funk3n
merge m:1 funk3n using correspondence_DISCO3_pcslayer, keep(master match) generate(pcs_merged)

* Per aar-cvrnr group: share of hours belonging to records without a layer
set more off
generate hours = joblon / timelon
generate implayer = (pcslayer == .) * hours
bysort aar cvrnr: egen totalimplayer = total(implayer)
by aar cvrnr: egen totalh = total(hours)
generate implayershare = totalimplayer / totalh

* Summarize that share using one record per cvrnr-aar group
capture drop eval
bysort cvrnr aar: generate eval = (_n == 1)
summarize implayershare if eval == 1, detail

* Recover based on previous year's and/or next year's layer at the same firm:
* Rows are ordered by pnr, cvrnr and aar, and every fill below only looks at
* neighbouring rows inside the same pnr-cvrnr group.
* NOTE(review): a neighbouring row is the adjacent observation, which is not
* guaranteed to be the adjacent calendar year (gaps in aar, or several rows
* sharing one pnr-cvrnr-aar) - confirm against the data.
sort pnr cvrnr aar
* Step 1: fill a missing layer when the rows directly before and after carry
* the same non-missing layer.
by pnr cvrnr: replace pcslayer = pcslayer[_n+1] if pcslayer == . ///
& pcslayer[_n-1]==pcslayer[_n+1] & pcslayer[_n+1] != . & pcslayer == .

* Step 2: fill from the row directly before, then from the row directly after.
* NOTE(review): replace works through the rows in order, so the [_n-1] fill
* can pass a layer on across several consecutive missing rows - confirm that
* this carry-forward is intended.
by pnr cvrnr: replace pcslayer = pcslayer[_n-1] if pcslayer == . & pcslayer[_n-1] != .
by pnr cvrnr: replace pcslayer = pcslayer[_n+1] if pcslayer == . & pcslayer[_n+1] != .

* Step 3: same as step 2, but looking two rows back and two rows ahead.
by pnr cvrnr: replace pcslayer = pcslayer[_n-2] if pcslayer == . & pcslayer[_n-2] != .
by pnr cvrnr: replace pcslayer = pcslayer[_n+2] if pcslayer == . & pcslayer[_n+2] != .

* Per aar-cvrnr group: share of hours that still lack a layer at this point
generate implayer2 = (pcslayer == .) * hours
bysort aar cvrnr: egen totalimplayer2 = total(implayer2)
generate implayershare2 = totalimplayer2 / totalh

* Flag one record per cvrnr-aar group again and summarize the share
capture drop eval
bysort cvrnr aar: generate eval = (_n == 1)
summarize implayershare2 if eval == 1, detail

* Cutoff for the main sample: at most 20% imputed layers
summarize implayershare2 if eval == 1 & implayershare2 <= 0.2, detail

* Impute remaining missing layer data from distance to layer wages.
* For every aar-cvrnr group and every layer X = 1..4:
*   layerwX = sum of joblon over the records currently in layer X
*   layerhX = sum of hours over the same records
*   nwX     = layerwX / layerhX (missing when the group has nobody in layer X)
*   diffX   = absolute distance between the record's timelon and nwX
* helpwX / helphX hold the group totals on the layer-X records only; max()
* then spreads them to every record of the group.
forvalues x = 1/4 {
    bysort aar cvrnr: egen helpw`x' = total(joblon) if pcslayer == `x'
    by aar cvrnr: egen layerw`x' = max(helpw`x')
    by aar cvrnr: egen helph`x' = total(hours) if pcslayer == `x'
    by aar cvrnr: egen layerh`x' = max(helph`x')
    generate nw`x' = layerw`x' / layerh`x'
    generate diff`x' = abs(timelon - nw`x')
}

* Smallest distance over the four layers. The variables are listed one by
* one so that an unrelated variable whose name starts with "diff" cannot
* enter the minimum.
egen mindiff = rowmin(diff1 diff2 diff3 diff4)

* Give each record without a layer the closest one; on ties the lowest
* layer number wins because it is assigned first.
* mindiff < . is required: when no layer wage exists in the group, all
* distances and mindiff are missing, and in Stata "missing == missing" is
* true, which would otherwise assign layer 1 to every such record.
forvalues x = 1/4 {
    replace pcslayer = `x' if pcslayer == . & mindiff < . & diff`x' == mindiff
}

* The final variable is called layer
rename pcslayer layer

* Groups above the 20% cutoff get no layer at all; this blanks every record
* of the group, not only the imputed ones. A missing implayershare2 also
* satisfies "> 0.2", because Stata treats missing as larger than any number.
replace layer = . if implayershare2 > 0.2

* Remove the helper variables created during the layer construction
drop pcs_merged order2 help* layerw* layerh* nw* diff* mindiff totalimplayer*

sort aar cvrnr pnr
save "workerdata_layer.dta", replace

* capture: no log is opened in the visible part of this do-file, and a bare
* "log close" aborts with an error when no log is open. A log opened by a
* calling do-file is still closed as before.
capture log close





